# Load libraries
library(RCurl)
library(tidyverse)
library(magrittr)
library(plotly)
library(imputeTS)
library(lubridate)
library(timetk)
library(ggcorrplot)
# Load data (relative file path)
df <- read.csv("../data/energy_dataset.csv")
# Formatting time variable to datetime object
# NB: Time contains both UTC+1 and UTC+2 time-zones - date functions automatically pick this up and convert to UTC
df$time <- as_datetime(df$time)
head(df)
dim(df) # Number of rows and columns
## [1] 35064 29
summary(df) # Distribution and missingness for each variable
## time generation.biomass
## Min. :2014-12-31 23:00:00 Min. : 0.0
## 1st Qu.:2016-01-01 04:45:00 1st Qu.:333.0
## Median :2016-12-31 10:30:00 Median :367.0
## Mean :2016-12-31 10:30:00 Mean :383.5
## 3rd Qu.:2017-12-31 16:15:00 3rd Qu.:433.0
## Max. :2018-12-31 22:00:00 Max. :592.0
## NA's :19
## generation.fossil.brown.coal.lignite generation.fossil.coal.derived.gas
## Min. : 0.0 Min. :0
## 1st Qu.: 0.0 1st Qu.:0
## Median :509.0 Median :0
## Mean :448.1 Mean :0
## 3rd Qu.:757.0 3rd Qu.:0
## Max. :999.0 Max. :0
## NA's :18 NA's :18
## generation.fossil.gas generation.fossil.hard.coal generation.fossil.oil
## Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 4126 1st Qu.:2527 1st Qu.:263.0
## Median : 4969 Median :4474 Median :300.0
## Mean : 5623 Mean :4256 Mean :298.3
## 3rd Qu.: 6429 3rd Qu.:5839 3rd Qu.:330.0
## Max. :20034 Max. :8359 Max. :449.0
## NA's :18 NA's :18 NA's :19
## generation.fossil.oil.shale generation.fossil.peat generation.geothermal
## Min. :0 Min. :0 Min. :0
## 1st Qu.:0 1st Qu.:0 1st Qu.:0
## Median :0 Median :0 Median :0
## Mean :0 Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0 Max. :0
## NA's :18 NA's :18 NA's :18
## generation.hydro.pumped.storage.aggregated
## Mode:logical
## NA's:35064
##
##
##
##
##
## generation.hydro.pumped.storage.consumption
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 68.0
## Mean : 475.6
## 3rd Qu.: 616.0
## Max. :4523.0
## NA's :19
## generation.hydro.run.of.river.and.poundage generation.hydro.water.reservoir
## Min. : 0.0 Min. : 0
## 1st Qu.: 637.0 1st Qu.:1077
## Median : 906.0 Median :2164
## Mean : 972.1 Mean :2605
## 3rd Qu.:1250.0 3rd Qu.:3757
## Max. :2000.0 Max. :9728
## NA's :19 NA's :18
## generation.marine generation.nuclear generation.other
## Min. :0 Min. : 0 Min. : 0.00
## 1st Qu.:0 1st Qu.:5760 1st Qu.: 53.00
## Median :0 Median :6566 Median : 57.00
## Mean :0 Mean :6264 Mean : 60.23
## 3rd Qu.:0 3rd Qu.:7025 3rd Qu.: 80.00
## Max. :0 Max. :7117 Max. :106.00
## NA's :19 NA's :17 NA's :18
## generation.other.renewable generation.solar generation.waste
## Min. : 0.00 Min. : 0 Min. : 0.0
## 1st Qu.: 73.00 1st Qu.: 71 1st Qu.:240.0
## Median : 88.00 Median : 616 Median :279.0
## Mean : 85.64 Mean :1433 Mean :269.5
## 3rd Qu.: 97.00 3rd Qu.:2578 3rd Qu.:310.0
## Max. :119.00 Max. :5792 Max. :357.0
## NA's :18 NA's :18 NA's :19
## generation.wind.offshore generation.wind.onshore forecast.solar.day.ahead
## Min. :0 Min. : 0 Min. : 0
## 1st Qu.:0 1st Qu.: 2933 1st Qu.: 69
## Median :0 Median : 4849 Median : 576
## Mean :0 Mean : 5464 Mean :1439
## 3rd Qu.:0 3rd Qu.: 7398 3rd Qu.:2636
## Max. :0 Max. :17436 Max. :5836
## NA's :18 NA's :18
## forecast.wind.offshore.eday.ahead forecast.wind.onshore.day.ahead
## Mode:logical Min. : 237
## NA's:35064 1st Qu.: 2979
## Median : 4855
## Mean : 5471
## 3rd Qu.: 7353
## Max. :17430
##
## total.load.forecast total.load.actual price.day.ahead price.actual
## Min. :18105 Min. :18041 Min. : 2.06 Min. : 9.33
## 1st Qu.:24794 1st Qu.:24808 1st Qu.: 41.49 1st Qu.: 49.35
## Median :28906 Median :28901 Median : 50.52 Median : 58.02
## Mean :28712 Mean :28697 Mean : 49.87 Mean : 57.88
## 3rd Qu.:32263 3rd Qu.:32192 3rd Qu.: 60.53 3rd Qu.: 68.01
## Max. :41390 Max. :41015 Max. :101.99 Max. :116.80
## NA's :36
There are eight columns that contain no values (all NAs or zeroes). These are removed here.
df <- df %>%
select(-c(generation.fossil.coal.derived.gas,
generation.fossil.oil.shale,
generation.fossil.peat,
generation.geothermal,
generation.marine,
generation.hydro.pumped.storage.aggregated,
generation.wind.offshore,
forecast.wind.offshore.eday.ahead))
colSums(is.na(df)) # Number of missing values for each remaining variable
## time
## 0
## generation.biomass
## 19
## generation.fossil.brown.coal.lignite
## 18
## generation.fossil.gas
## 18
## generation.fossil.hard.coal
## 18
## generation.fossil.oil
## 19
## generation.hydro.pumped.storage.consumption
## 19
## generation.hydro.run.of.river.and.poundage
## 19
## generation.hydro.water.reservoir
## 18
## generation.nuclear
## 17
## generation.other
## 18
## generation.other.renewable
## 18
## generation.solar
## 18
## generation.waste
## 19
## generation.wind.onshore
## 18
## forecast.solar.day.ahead
## 0
## forecast.wind.onshore.day.ahead
## 0
## total.load.forecast
## 0
## total.load.actual
## 36
## price.day.ahead
## 0
## price.actual
## 0
16 out of 22 of the remaining variables each have a maximum of 36 missing values. Looking at where these occur, we see that most missing values occur across variables. Specifically, 90% of the missing values occur over only 18 rows. The remaining 10% are spread across 28 rows.
# Rows where data is missing
df[!complete.cases(df),]
# Number of NAs per row
missing_per_row <- rowSums(is.na(df[!complete.cases(df),]))
missing_per_row
## 100 109 110 111 112 113 114 452 453 644 662 752 753
## 14 15 15 15 15 15 15 14 14 14 15 1 1
## 754 757 758 759 760 761 762 763 764 2259 2529 2624 2709
## 1 1 1 1 1 1 1 1 1 1 15 1 15
## 2914 3555 3969 6584 6587 8050 11237 11525 11527 11903 12673 13342 13392
## 1 1 14 1 15 15 1 1 1 1 1 14 1
## 15273 15983 16613 25165 25172 30186 30897
## 1 1 1 1 1 1 15
sum(missing_per_row > 1)
## [1] 18
sum(missing_per_row[missing_per_row > 1])/sum(missing_per_row)
## [1] 0.9041096
We can visualise where in the series the missing values occur using the following function:
ggplot_na_distribution(df$generation.biomass)
However, this only allows us to look at one variable at a time. There are other functions in the imputeTS package which can be useful for investigating missingness in a (univariate) time series.
We have decided to interpolate the missing values using linear interpolation. The only places where this might be an issue is where are two gaps of length 6 and 8. These will be visualised below.
# Interpolate missing values using linear interpolation
df_clean <- df %>% mutate(across(generation.biomass:price.actual, .fns = na_interpolation, option = 'linear'))
# Verifying missing values have been filled
df_clean[!complete.cases(df_clean),]
Visualising interpolation
# Creating a 'missing' flag to indicate imputation
df_clean$interp_biomass <- as.integer(is.na(df$generation.biomass))
ggplot(data = df_clean[90:150,]) + geom_line(aes(x=time, y = generation.biomass, colour = interp_biomass), show.legend = FALSE) + scale_color_gradient(low="black", high="red") + labs(title = "Example of linear interpolation (in red) applied to the dataset")
What are we left with in df_clean?
35,000 obs across 20 variables, hourly from start 2015 to end 2018:
Energy generation from 14 different sources
Forecast onshore wind and solar, one day ahead
Forecast and actual total electricity load
Forecast and actual electricity price
And in the other dataset, not yet loaded: hourly weather data over the same period for a number of Spanish cities
gen_data <- df_clean %>%
select(c(1, starts_with("generation")))
# Giving vars simpler names
names(gen_data) <- c("time",
"Biomass",
"Lignite",
"Gas",
"Hard Coal",
"Oil",
"Hydro Pumped",
"Hydro River",
"Hydro Reservoir",
"Nuclear",
"Other",
"Other Renewable",
"Solar",
"Waste",
"Wind") #Renaming from Wind Onshore as no other wind var
# Converting to long form, grouping
long_gen_data <- gen_data %>%
pivot_longer(cols = -time, names_to = "generation_source") %>%
group_by(generation_source)
head(long_gen_data)
Getting statistics over entire dataset for each source
overall_stats <- long_gen_data %>%
summarise(sum = sum(value, na.rm = TRUE),
mean = mean(value, na.rm = TRUE),
sd = sd(value, na.rm = TRUE),
max = max(value, na.rm = TRUE)) %>%
mutate(generation_source= fct_reorder(generation_source, desc(sum)))
sources_by_sum <- arrange(overall_stats, desc(sum))
sources_by_sum
ggplot(data = overall_stats) + geom_col(aes(x = generation_source, y = sum)) + theme(axis.text.x = element_text(angle=90)) + labs(title = "Total electricity generated by source")
sources_by_sum %>% pivot_longer(cols = c(sum, mean, sd, max), names_to = "stat",
values_to = "value") %>%
filter(stat != "sum") %>%
ggplot() + geom_col(aes(x = generation_source, y = value, fill = stat), position = "dodge") + theme(axis.text.x = element_text(angle=90)) + labs(title = "Other overall hourly statistics of each source")
These plots show that nuclear generation has produced the most electricity over the time series, and has the highest mean hourly value. Nuclear is also a very steady generation method, having the smallest standard deviation out of the top six generation sources. Gas has the second highest total and mean generation, and the highest maximum value in an hour. Wind, the third largest contributor, is the most variable generation source, and has the second highest maximum hourly value.
Using this information to re-factor the generation_source variable, so that plots will order the sources from most to least important.
long_gen_data$generation_source <- factor(long_gen_data$generation_source, levels = sources_by_sum$generation_source)
Smoothing the data on a daily scale
gen_data_smoothed <- long_gen_data %>%
summarise_by_time(time, .by = 'week', adjusted = mean(value))
head(gen_data_smoothed)
Looking at each series over time in isolation, with quarterly smoothing (This function is nice as it includes an interactive feature)
gen_data_smoothed %>%
filter(generation_source %in% sources_by_sum$generation_source[1:6]) %>%
plot_time_series(time, adjusted, .facet_ncol = 3,
.smooth_period = "12 months", .facet_scales='fixed',
.title= "Weekly averages for six biggest generation sources + 12-month smoothing")
gen_data_smoothed %>%
filter(generation_source %in% sources_by_sum$generation_source[7:14]) %>%
plot_time_series(time, adjusted, .facet_ncol = 3,
.smooth_period = "12 months", .facet_scales='fixed',
.title= "Weekly averages for six smallest generation sources + 12-month smoothing")
These plots emphasise the relatively steady week-to-week electricity generation from nuclear over the time period, compared with the high variability in wind and gas (note that each row has the same y-axis). Solar shows a strong seasonal pattern within each year, peaking in July (summer) and reaching a low in December (winter) each year. All hydro sources appears to show seasonality on a two-year scale.
In terms of long-term trends, biomass, oil, and “other” all show a similar decrease in production at the start of 2016, followed by a steady period until the end of the series. Meanwhile, Waste and Other Renewable show a similar increasing trend over the four year period. Gas increases slightly, whereas Hard Coal decreased slightly. Nuclear, Wind, Solar, Lignite, and all Hydro sources are steady.
long_gen_data %>%
summarise_by_time(time, .by = '3 months', adjusted = mean(value)) %>%
ggplot() +
geom_area(aes(x = time, y = adjusted, fill = generation_source), stat = "identity")
# Proportions
gen_data_props <- cbind(gen_data[1], prop.table(as.matrix(gen_data[-1]), margin = 1))
long_gen_data_props <- gen_data_props %>%
pivot_longer(cols = -time, names_to = "generation_source")
# Ordering sources by importance
long_gen_data_props$generation_source <- factor(long_gen_data_props$generation_source, levels = sources_by_sum$generation_source)
long_gen_data_props %>%
group_by(generation_source) %>%
summarise_by_time(time, .by = '3 months', adjusted = mean(value)) %>%
ggplot() +
geom_area(aes(x = time, y = adjusted, fill = generation_source), stat = "identity") +
theme_minimal()
# Correlation plot. Sample size for computational efficiency.
df_corr <- gen_data[1:10000,2:ncol(gen_data)]
corrmatrix <- df_corr %>% cor()
ggcorrplot(corrmatrix,
hc.order = TRUE,
type = "lower",
lab = FALSE)
# Extract individual correlations using the following:
corrmatrix['Hard Coal', 'Hydro Pumped']
## [1] -0.5379619
corrmatrix['Hard Coal', 'Wind']
## [1] -0.5821862
corrmatrix['Hard Coal', 'Lignite']
## [1] 0.8355603
corrmatrix['Hydro River', 'Hydro Reservoir']
## [1] 0.6465254
The most positive correlation is between Hard Coal and Lignite (0.83), followed by Hydro River and Hydro Reservoir (0.65). This makes sense given the similarity of these sources. The most negative correlation is between Hard Coal and Wind (-0.58), followed by Hard Coal and Hydro Pumped (-0.54). These relationships could reflect the variability in the generation by the renewable sources (Wind and Hydro, see figures …), and the need to compensate with fossil fuels when generation is low. These relationships are further examined in the plots below.
long_gen_data[1:10000,] %>% filter(generation_source %in% c('Wind', 'Hard Coal','Lignite')) %>% ungroup() %>% plot_time_series(time, value, .color_var = generation_source, .smooth = 0)
long_gen_data[1:10000,] %>% filter(generation_source %in% c('Hydro Reservoir', 'Hydro River')) %>% ungroup() %>% plot_time_series(time, value, .color_var = generation_source, .smooth = 0)
# Create time variables
long_gen_data$year <- year(long_gen_data$time)
long_gen_data$month <- month(long_gen_data$time)
long_gen_data$day <- day(long_gen_data$time)
long_gen_data$hour <- hour(long_gen_data$time)
long_gen_data_props$year <- year(long_gen_data_props$time)
long_gen_data_props$month <- month(long_gen_data_props$time)
long_gen_data_props$day <- day(long_gen_data_props$time)
long_gen_data_props$hour <- hour(long_gen_data_props$time)
Seasonality within a day
hourly_generation_plot_by_year <- long_gen_data %>%
ggplot(aes(x = hour, y = value, fill = generation_source)) +
geom_bar(stat = "identity")
hourly_generation_plot_by_year
# Proportions
hourly_generation_plot_by_year_props <- long_gen_data_props %>%
ggplot(aes(x = hour, y = value, fill = generation_source)) +
geom_bar(stat = "identity")
hourly_generation_plot_by_year_props
Seasonality within a month
daily_generation_plot_by_year <- long_gen_data %>%
ggplot(aes(x = day, y = value, fill = generation_source)) +
geom_bar(stat = "identity") +
facet_wrap(~year) + theme(legend.position = "none")
daily_generation_plot_by_year
# Note: Similar problem to raw values
daily_generation_plot_by_year_props <- long_gen_data_props %>%
ggplot(aes(x = day, y = value, fill = generation_source)) +
geom_bar(stat = "identity") +
facet_wrap(~year) + theme(legend.position = "none")
daily_generation_plot_by_year_props
Seasonality within a year
monthly_generation_plot <- long_gen_data %>%
ggplot(aes(x = month, y = value, fill = generation_source)) +
geom_bar(stat = "identity") +
facet_wrap(~year)
monthly_generation_plot
# Note: Shorter bars for shorter months as above
monthly_generation_plot_props <- long_gen_data_props %>%
ggplot(aes(x = month, y = value, fill = generation_source)) +
geom_bar(stat = "identity") +
facet_wrap(~year)
monthly_generation_plot_props
datain <- long_gen_data %>% group_by(generation_source, year, day, month) %>% summarise(across(c(value), c(max, min)))
## `summarise()` has grouped output by 'generation_source', 'year', 'day'. You can override using the `.groups` argument.
datain <- datain %>%
mutate(date = make_date(year, month, day))
head(datain)
names <- unique(datain$generation_source)
for(name in names){
datain2 <- filter(datain, generation_source == name)
print(datain2 %>% ggplot()+geom_line(aes(x = date, y = (value_1 - value_2)/((value_1+value_2)/2) , color = generation_source))+theme(legend.position="bottom") + ggtitle(name)
)
}
## Warning: Removed 3 row(s) containing missing values (geom_path).
datain <- long_gen_data %>% group_by(generation_source, year, day, month) %>% summarise(across(c(value), c(max, min)))
## `summarise()` has grouped output by 'generation_source', 'year', 'day'. You can override using the `.groups` argument.
datain <- datain %>%
mutate(date = make_date(year, month, day))
head(datain)
names <- unique(datain$generation_source)
for(name in names){
datain2 <- filter(datain, generation_source == name)
print(datain2 %>% ggplot()+geom_line(aes(x = date, y = value_1, color = "max"))+geom_line(aes(x = date, y = value_2, color = "min"))+theme(legend.position="bottom")+ggtitle(name)
)
}
long_gen_data[is.na(long_gen_data)] <- mean(long_gen_data$value)
datain <- long_gen_data %>% group_by(generation_source, year, month) %>% summarise(across(c(value), c(max, min)))
## `summarise()` has grouped output by 'generation_source', 'year'. You can override using the `.groups` argument.
datain2 <- datain %>%
mutate(date = make_date(year, month))
head(datain2)
names <- unique(datain2$generation_source)
for(name in names){
datain3 <- filter(datain2, generation_source == name)
print(datain3 %>% ggplot()+geom_line(aes(x = date, y = value_1, color = "max"))+geom_line(aes(x = date, y = value_2, color = "min"))+theme(legend.position="bottom")+ggtitle(name))
}